File set-up

Set working directory to current directory

if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolve conflicts

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package

Read data

cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')
## Rows: 1560 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (20): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cq

Presence in db (n = 890)

make count table

cont_table = cq %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group, n_db) %>%
  unique() %>%
  # only keep high abundant circRNAs
  filter(count_group == 'count ≥ 5') %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
  # change nr of databases to binary
  mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
  group_by(n_db) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-n_db)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("novel", "in_db")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##           fail     pass
## novel 16.23933 132.7607
## in_db 80.76067 660.2393

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 162.61, df = 1, p-value < 2.2e-16

OR

OR = (705/36)/(88/61)
OR
## [1] 13.57481

Canonical splicing (n = 722)

cont_table = cq %>%
  filter(count_group == 'count ≥ 5',
         !strand == 'unknown') %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ss_motif) %>%
  unique() %>%
  mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail')) %>%
  # change nr of databases to binary
  group_by(ss_can) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-ss_can)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("non_cannonical", "cannonical")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##                    fail      pass
## non_cannonical 11.99169  99.00831
## cannonical     66.00831 544.99169

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 42.044, df = 1, p-value = 8.923e-11

OR

OR = (565/46)/(79/32)
OR
## [1] 4.975234

Known annotation (n = 891)

cont_table = cq %>%
  filter(count_group == "count ≥ 5") %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ENST) %>%
  unique() %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail'),
         ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>% 
  # change nr of databases to binary
  group_by(ENST_group) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-ENST_group)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("match", "no_match")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##               fail      pass
## match    87.637486 717.36251
## no_match  9.362514  76.63749

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 203.2, df = 1, p-value < 2.2e-16

OR

OR = (757/48)/(37/49)
OR
## [1] 20.8857

CircRNA detection tool approach (n = 798)

cont_table = cq %>%
  filter(count_group == "count ≥ 5") %>%
  left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, approach) %>%
  filter(!approach == 'integrative') %>%
  unique() %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail')) %>% 
  # change nr of databases to binary
  group_by(approach) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-approach)
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): tool, approach, tool_lt
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "tool"
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("candidate-based", "segmented read-based")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##                          fail     pass
## candidate-based      21.54762 159.4524
## segmented read-based 73.45238 543.5476

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 6.8785, df = 1, p-value = 0.008724

OR

OR = (170/11)/(533/84)
OR
## [1] 2.435613

BSJ count group (n = 1042)

cont_table = cq %>%
  filter(!count_group == "no_counts") %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group) %>%
  unique() %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail')) %>%
  # change nr of databases to binary
  group_by(count_group) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-count_group)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("count < 5", "count ≥ 5")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##                fail     pass
## count < 5  19.83877 132.1612
## count ≥ 5 116.16123 773.8388

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 23.636, df = 1, p-value = 1.164e-06

OR

OR = (793/97)/(113/39)
OR
## [1] 2.821549